
do "E:/ReplicateBuild/02_code/00_environment/00_set_environment.do"

*Table of Contents
local appXwalk = 1
local appAccounts = 1
	local us_zip_data = 1
	local teach_geocodes = 1
local appEducation = 1
local appLicense = 1
local appExperience = 1
	local dataCheck = 1
	
********************************************************************************	
********************************************************************************
* Read in information applicants provided to the platform
********************************************************************************
********************************************************************************

********************************************************************************
* Read in applicant - NCERDC crosswalk
********************************************************************************
if `appXwalk' == 1 {
	********************************************************************************
	* Create dataset of all unique applicants with NCERDC IDs
	* Identifier: applicant_id
	********************************************************************************
	use "$rawdata/FOCAL/applicants_crosswalk", clear
	
	duplicates drop
	
	/* ren MISSINGVARNAME FOCALEmployeeID */ // original variable name identifies district
	
	ren (matchtype teachid) (matchtype_FOCAL_app ncerdc_id)
	replace applicant_id = trim(applicant_id)
	replace FOCALEmployeeID = . if FOCALEmployeeID==0
	
	sort applicant_id
	
	compress
	
	save "$basedata/FOCAL_applicants_crosswalk", replace
	
}

********************************************************************************
* Read in applicant account info, including location and FOCAL ID
********************************************************************************
if `appAccounts' == 1 {

	********************************************************************************
	* (tempfile) Generate lat/long and zipcode crosswalk
	* Identifier: zip5
	********************************************************************************
	if `us_zip_data' == 1 {
		import excel using "$auxdata/us-zip-code-latitude-and-longitude.xlsx", firstrow clear
		ren (Zip Latitude Longitude) (zip5 latitude longitude)
		keep zip5 latitude longitude

		
		sort zip5
		isid zip5
		
		tempfile tempZip
		save `tempZip', replace
	}

		********************************************************************************
		* (tempfile) Generate lat/long and zipcode crosswalk
		* Identifier: zip5
		********************************************************************************
		if `teach_geocodes' == 1{
			import excel "$rawdata/FOCAL/Applicant Tracking System Pub.xlsx", firstrow clear sheet("Applicants")
			
			
			replace zip = "" if zip=="NULL" // mix of 5- and 9-digit zips 
			gen zip9 = zip if strpos(zip,"-")>0 & length(zip)==10
			gen zip5 = substr(zip,1,5)
			/* replace zip5 = "XXX" if zip5=="XXX" */ // replacing actual value with XXX to preserve confidentiality
			destring zip5, replace
			
			keep zip5
			sort zip5
			duplicates drop

			merge 1:1 zip5 using `tempZip'
			drop if _m==2
			tab _m if zip5!=.
			tab zip5 if _m==1
			drop _m
			
			ren latitude teacher_app_latitude
			ren longitude teacher_app_longitude

			sort zip5
			

			tempfile tempGeo
			save `tempGeo', replace
		}
	
	********************************************************************************
	* Create dataset of all unique applicants with account info.
	* Identifier: applicant_id
	********************************************************************************
	import excel "$rawdata/FOCAL/Applicant Tracking System Pub.xlsx", firstrow clear sheet("Applicants")

	ren (creationdate Lastmodifieddatebyapplicant LastModifiedDate Applicationtype ///
	 RelativesinFOCAL FOCALEmployeeId CurrentEmployee AVAILABLE_DATE applicationtype) ///
	 (account_create_date last_modified_applicant_date last_modified_date app_position_type ///
	 FOCAL_relatives FOCALEmployeeID current_employee available_date_str applicant_type)

	* clean up some variables and convert strings to binary categories
	
	replace applicant_id = trim(applicant_id)

	gen instructional_applicant = (app_position_type=="Instructional")
	drop app_position_type

	replace zip = "" if zip=="NULL" // mix of 5- and 9-digit zips with a bunch of shorter errors?
	gen zip9 = zip if strpos(zip,"-")>0 & length(zip)==10
	gen zip5 = substr(zip,1,5)
	/* replace zip5 = "XXX" if zip5=="XXX" */ // replacing actual value with XXX to preserve confidentiality
	destring zip5, replace
	
	sort zip5
	merge n:1 zip5 using `tempGeo'
	assert _m!=2
	drop _m
/*
	replace teacher_app_latitude = XXX if zip5 == XXX
	replace teacher_app_longitude = XXX if zip5 == XXX
*/ 	// replacing actual value with XXX to preserve confidentiality

	destring areacode, ignore("(") replace

	gen has_FOCAL_relatives = (FOCAL_relatives=="Yes")
	drop FOCAL_relatives

	replace FOCALEmployeeID = . if FOCALEmployeeID==0

	gen current_FOCAL_employee = (current_employee=="Yes")
	drop current_employee

	replace available_date_str = "" if available_date_str=="NULL"
	gen available_date = date(available_date_str,"MDY")
	assert available_date!=. if available_date_str!=""
	drop available_date_str
	format available_date %d

	gen internal = strpos(applicant_type,"Internal")>0
	gen standard = strpos(applicant_type,"Standard")>0
	gen summer = strpos(applicant_type,"Summer")>0
	drop applicant_type

	sort applicant_id
	
	if `dataCheck' ==1 {
	* examine duplicates of applicant_id
		duplicates report applicant_id
		duplicates report applicant_id instructional_applicant // all duplicates are instructional
		preserve
		drop instructional_applicant
		duplicates report // all other variables are duplicated
		restore
	}
	* all duplicated applicant_id obs are co-registration of instructional and non-instructional accounts
	* collapse these
	ren instructional_applicant instructional_applicant_temp
	bys applicant_id: egen instructional_app = max(instructional_applicant_temp)
	bys applicant_id: egen noninstructional_app = min(instructional_applicant_temp)
	replace noninstructional_app = 1-noninstructional_app
	drop instructional_applicant_temp
	duplicates drop
	
	if `dataCheck' ==1 {
	* examine duplicates of FOCALEmployeeID
		sort FOCALEmployeeID applicant_id
		duplicates tag FOCALEmployeeID, gen(dupvar_FOCAL)
		tab dupvar_FOCAL if FOCALEmployeeID!=. // 6% have 1 duplicate, 0.2% have 2 duplicates
	}
	
	* merge on identifiers crosswalk
	ren FOCALEmployeeID FOCALEmployeeID_apps
	
	sort applicant_id
	merge 1:1 applicant_id using "$basedata/FOCAL_applicants_crosswalk"
	assert _m==3
	assert FOCALEmployeeID_apps==FOCALEmployeeID
	drop _m FOCALEmployeeID_apps
	
	sort applicant_id
	
	compress
	
	save "$basedata/FOCAL_applicant_account_info", replace
	
}





********************************************************************************
* Read in applicant's reported education, license, work experience
********************************************************************************

** Education
if `appEducation' == 1 {
	********************************************************************************
	* Create dataset of applicants' reported education
	* Identifier: applicant_id, educ_event_id
	********************************************************************************
	import excel "$rawdata/FOCAL/Applicant Tracking System Pub.xlsx", firstrow clear sheet("Education") allstring
	
	* fix two observations read in with shifted columns 
	replace degreeMajor = GraduationDate if J!=""
	replace GraduationDate = DegreeType if J!=""
	replace DegreeType = DateEnded if J!=""
	replace DateEnded = GPA if J!=""
	replace GPA = J if J!=""
	drop J

	* clean up variables and convert string variables to numeric
	foreach var in "DateRecordAdded" "GraduationDate" "DateEnded" {
		ren `var' `var'_str
		gen `var' = date(`var'_str,"MDY")
		assert `var'_str == "" if `var'==.
		drop `var'_str
		format `var' %d
	}
	replace collegename = trim(collegename)
	replace degreeMajor = trim(degreeMajor)
	replace DegreeType = trim(DegreeType)
	
	* fix GPA variable
	destring GPA, replace
	summ GPA, d
	replace GPA = . if GPA==0
	ren GPA gpa_allscales
	gen gpa = gpa_allscales if gpa_allscales<=4
	
	ren (degreeMajor DegreeType DateRecordAdded GraduationDate DateEnded) ///
	 (degree_major degree_type date_ed_record_added graduation_date date_ed_ended)
	 
	drop date_ed_record_added
	duplicates drop

	local sortvars = "applicant_id graduation_date date_ed_ended degree_type degree_major gpa college_id"
	
	
	if `dataCheck' ==1 {
	* examine duplicates of applicant_id
		duplicates report applicant_id // lots of duplicates
		duplicates tag `sortvars', gen(dupvar)
		assert dupvar==0
		drop dupvar

	}

	sort `sortvars'

	by applicant_id: gen educ_event_id = _n
	
	compress
	
	save "$basedata/FOCAL_applicant_education", replace
	
}


** License
if `appLicense' == 1 {
	********************************************************************************
	* Create dataset of applicants' reported license
	* Identifier: applicant_id, license_id
	********************************************************************************
	import excel "$rawdata/FOCAL/Applicant Tracking System Pub.xlsx", firstrow clear sheet("License") allstring
	
	* fix five observations read in with shifted columns 
	replace license_type_text = other_endorsement_name if V!=""
	replace other_endorsement_name = st_board_certified if V!=""
	replace st_board_certified = submit_date if V!=""
	replace submit_date = willing_to_teach if V!=""
	replace willing_to_teach = license_status_id if V!=""
	replace license_status_id = Licstatus if V!=""
	replace Licstatus = license_id if V!=""
	replace license_id = coverage if V!=""
	replace coverage = license_state if V!=""
	replace license_state = expiration_date if V!=""
	replace expiration_date = S if V!=""
	replace S = Licesnurearea if V!=""
	replace Licesnurearea = Licesnurearea2 if V!=""
	replace Licesnurearea2 = V if V!=""
	drop V
	
	ren (Licstatus Licesnurearea Licesnurearea2) (licstatus licensurearea licensurearea2)
	assert st_board_certified==S // S is just duplicated state board certification
	drop S
	
	drop admin_license issue_date coverage // no observations for these variables


	* clean up variables and convert string variables to numeric
	foreach var in "addeddate" "submit_date" "expiration_date" {
		replace `var' = "" if `var'=="00:00"
		ren `var' `var'_str
		gen `var' = date(`var'_str,"MDY")
		assert `var'_str == "" if `var'==.
		drop `var'_str
		format `var' %d
	}

	foreach var in "licensetype" "license_type_text" "other_endorsement_name" "licensurearea" "licensurearea2" {
		replace `var' = trim(`var')
	}
	
	foreach var in "st_board_certified" "willing_to_teach" {
		ren `var' `var'_str
		gen `var' = (`var'_str=="Y") if `var'_str!=""
		drop `var'_str
	}
	
	replace licstatus = "active" if strpos(licstatus,"Active")>0
	replace licstatus = "expired" if strpos(licstatus,"Expired")>0
	replace licstatus = "in_process" if strpos(licstatus,"In Process")>0
	replace licstatus = "inactive" if strpos(licstatus,"Inactive")>0
	replace licstatus = "will_apply" if strpos(licstatus,"Will apply upon Hire")>0
	drop license_status_id
	
	duplicates drop
	
	duplicates report applicant_id license_id // these uniquely identify observations
	

	sort applicant_id license_id

	
	compress
	
	save "$basedata/FOCAL_applicant_license", replace
	
}


** Experience
if `appExperience' == 1 {
	********************************************************************************
	* Create dataset of applicants' reported experience
	* Identifier: applicant_id, exp_event_id
	********************************************************************************
	import excel "$rawdata/FOCAL/Applicant Tracking System Pub.xlsx", firstrow clear sheet("Work Exp") allstring
	
	* fix one observation read in with shifted columns 
	gen shifted = strpos(wage,"Memphis, TN 38116")>0
	replace Position = wage if shifted
	replace wage = from_date if shifted
	replace from_date = to_date if shifted
	replace to_date = current_position if shifted
	replace current_position = may_contact if shifted
	replace may_contact = WorkDesc if shifted
	replace WorkDesc = "" if shifted
	drop shifted
	
	drop place // no observations for this variable


	* clean up variables and convert string variables to numeric
	foreach var in "from_date" "to_date" {
		ren `var' `var'_str
		gen `var' = date(`var'_str,"MDY")
		assert `var'_str == "" if `var'==.
		drop `var'_str
		format `var' %d
	}
	
	foreach var in "ExpType" "Position" "wage" "WorkDesc" {
		replace `var' = trim(`var')
	}
	
	foreach var in "current_position" "may_contact" {
		ren `var' `var'_str
		gen `var' = (`var'_str=="Y") if `var'_str!=""
		drop `var'_str
	}
	
	destring wage, replace
	ren wage wage_allformats
	
	* note: ExpType only takes on a limited number of values
	tab ExpType
	
	duplicates drop

	local sortvars = "applicant_id from_date to_date current_position Position ExpType WorkDesc wage"
	
	
	if `dataCheck' ==1 {
	* examine duplicates of applicant_id
		duplicates report applicant_id // lots of duplicates
		duplicates tag `sortvars', gen(dupvar)
		assert dupvar==0
		drop dupvar

	}

	sort `sortvars'
	
	ren (ExpType WorkDesc) (exp_type work_desc)

	by applicant_id: gen exp_event_id = _n
	
	compress
	
	save "$basedata/FOCAL_applicant_experience", replace
	
}
